#Part I

Data Wrangling

In the hackathon a project was proposed to collect data from student video watching, a sample of this data is available in the file video-data.csv.

stid = student id; year = year student watched video; participation = whether or not the student opened the video; watch.time = how long the student watched the video for; confusion.points = how many times a student rewatched a section of a video; key.points = how many times a student skipped or increased the speed of a video

#Install the 'tidyverse' package or if that does not work, install the 'dplyr' and 'tidyr' packages.

#Load the package(s) you just installed

library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(tidyr)
library(dplyr)

# Read the video-watching sample into D1 (header = TRUE is read.csv's
# default, kept explicit for clarity)
D1 <- read.csv("video-data.csv", header = TRUE)

#Create a data frame that only contains the year 2018
D2 <- D1 %>% filter(year == 2018)

Histograms

#Generate a histogram of the watch time for the year 2018

hist(D2$watch.time)

#Change the number of breaks to 100, do you get the same impression?

# breaks = 100 is only a suggestion; hist() chooses "pretty" cut points
# near that count
hist(D2$watch.time, breaks = 100)

#Cut the y-axis off at 10

hist(D2$watch.time, breaks = 100, ylim = c(0,10))

#Restore the y-axis and change the breaks so that they are 0-5, 5-20, 20-25, 25-35

# With unequal bin widths hist() plots densities instead of counts, and the
# break vector must span the full range of watch.time or hist() errors
hist(D2$watch.time, breaks = c(0,5,20,25,35))

Plots

#Plot the number of confusion points against the watch time

# Scatterplot with confusion.points on the x-axis and watch.time on the y-axis.
# NOTE(review): plot(x, y) puts the first argument on the x-axis — swap the
# arguments if watch.time is meant to be the predictor.
plot(D1$confusion.points, D1$watch.time)

# Create two example vectors x & y
x <- c(1, 3, 2, 7, 6, 4, 4)
y <- c(2, 4, 2, 3, 2, 4, 3)

# Cross-tabulate x against y: cell [i, j] counts how many positions have
# x == i and y == j simultaneously
table1 <- table(x, y)

# Stacked barplot of the contingency table: one bar per y value,
# stacked segments per x value
barplot(table1)

#Create a data frame of the average total key points for each year and plot the two against each other as a line

# Group by year and take the mean of key.points. `.groups = "drop"` returns an
# ungrouped data frame and silences the summarise() regrouping message that the
# original code emitted.
D3 <- D1 %>% group_by(year) %>% summarise(mean_key = mean(key.points), .groups = "drop")
plot(D3$year, D3$mean_key, type = "l", lty = "dashed")

#Create a boxplot of watch time for three students (stid 4, 20 and 22)
D4 <- filter(D1, stid %in% c(4, 20, 22))
#droplevels() discards factor levels that no longer occur in the subset,
#so empty groups do not appear in the boxplot
D4 <- droplevels(D4)
boxplot(watch.time ~ stid, data = D4, xlab = "Student", ylab = "Watch Time")

## Pairs

#Use matrix notation to select columns 2, 5, 6, and 7
# NOTE(review): positional indexing is fragile if the CSV column order changes;
# presumably these are year, watch.time, confusion.points, key.points — verify
# against the CSV header.
D5 <- D1[,c(2,5,6,7)]
#Draw a matrix of plots for every combination of variables
pairs(D5)

## Part II

  1. Create a simulated data set containing 100 students, each with a score from 1-100 representing performance in an educational game. The scores should tend to cluster around 75. Also, each student should be given a classification that reflects one of four interest groups: sport, music, nature, literature.
#rnorm(100, 75, 15) creates a random sample with a mean of 75 and standard deviation of 15
#filter() can be used to set a maximum and minimum value
#round() rounds numbers to whole number values
#sample() draws random samples from the groups vector according to a uniform distribution

#simulate data
# NOTE(review): there is no set.seed() call, so the simulated scores differ on
# every run and will not match the printed output below.
score <- rnorm(100, 75, 15) 
hist(score,breaks=30)

S1 <-data.frame(score)

#drop the samples over 100 (scores of 100 are appended back in later)
library(dplyr)
S1<-filter(S1,score<=100) 
hist(S1$score)

S1
##       score
## 1  80.87487
## 2  85.01307
## 3  88.43175
## 4  85.45668
## 5  62.00681
## 6  82.52423
## 7  78.15836
## 8  60.30607
## 9  91.58159
## 10 72.89160
## 11 95.22551
## 12 81.92356
## 13 85.42065
## 14 54.27170
## 15 62.60689
## 16 92.09013
## 17 75.18500
## 18 69.28725
## 19 71.33826
## 20 60.52950
## 21 67.73504
## 22 97.09658
## 23 84.62451
## 24 62.72625
## 25 77.73485
## 26 78.86311
## 27 70.83471
## 28 94.19442
## 29 97.80616
## 30 61.77682
## 31 90.20980
## 32 83.53310
## 33 81.62467
## 34 80.34869
## 35 74.04995
## 36 69.29995
## 37 92.54230
## 38 56.72008
## 39 85.40762
## 40 89.84091
## 41 62.66578
## 42 51.91075
## 43 76.73034
## 44 67.68292
## 45 70.69177
## 46 66.65772
## 47 54.52726
## 48 71.86613
## 49 51.99740
## 50 74.27187
## 51 67.59639
## 52 36.87490
## 53 70.11316
## 54 71.34340
## 55 82.24572
## 56 96.48479
## 57 54.51704
## 58 64.07890
## 59 58.50897
## 60 51.19677
## 61 79.92925
## 62 41.60717
## 63 69.61532
## 64 71.42719
## 65 76.39338
## 66 76.34355
## 67 86.97454
## 68 64.28295
## 69 87.46535
## 70 96.58134
## 71 72.91454
## 72 91.81606
## 73 72.25564
## 74 83.45995
## 75 53.88130
## 76 36.11727
## 77 76.89793
## 78 65.18067
## 79 93.68269
## 80 69.36480
## 81 66.06799
## 82 68.56357
## 83 40.61765
## 84 79.70890
## 85 89.67365
## 86 76.10254
## 87 60.10658
## 88 85.77869
## 89 72.40631
## 90 47.17927
## 91 58.39076
## 92 80.85854
## 93 77.88864
## 94 71.65038
## 95 71.31879
# Append five scores of 100 (this sample had 5 draws above 100).
# The column MUST be named "score" so that bind_rows() stacks these rows under
# S1$score. The original unnamed data.frame(rep(100, 5)) produced a column
# called "rep.100..5.", so bind_rows() created two half-NA columns — that is
# what the stale NA-filled output below shows.
S2 <- data.frame(score = rep(100, 5))
names(S2)
S3 <- bind_rows(S1, S2)
S3
##        score rep.100..5.
## 1   80.87487          NA
## 2   85.01307          NA
## 3   88.43175          NA
## 4   85.45668          NA
## 5   62.00681          NA
## 6   82.52423          NA
## 7   78.15836          NA
## 8   60.30607          NA
## 9   91.58159          NA
## 10  72.89160          NA
## 11  95.22551          NA
## 12  81.92356          NA
## 13  85.42065          NA
## 14  54.27170          NA
## 15  62.60689          NA
## 16  92.09013          NA
## 17  75.18500          NA
## 18  69.28725          NA
## 19  71.33826          NA
## 20  60.52950          NA
## 21  67.73504          NA
## 22  97.09658          NA
## 23  84.62451          NA
## 24  62.72625          NA
## 25  77.73485          NA
## 26  78.86311          NA
## 27  70.83471          NA
## 28  94.19442          NA
## 29  97.80616          NA
## 30  61.77682          NA
## 31  90.20980          NA
## 32  83.53310          NA
## 33  81.62467          NA
## 34  80.34869          NA
## 35  74.04995          NA
## 36  69.29995          NA
## 37  92.54230          NA
## 38  56.72008          NA
## 39  85.40762          NA
## 40  89.84091          NA
## 41  62.66578          NA
## 42  51.91075          NA
## 43  76.73034          NA
## 44  67.68292          NA
## 45  70.69177          NA
## 46  66.65772          NA
## 47  54.52726          NA
## 48  71.86613          NA
## 49  51.99740          NA
## 50  74.27187          NA
## 51  67.59639          NA
## 52  36.87490          NA
## 53  70.11316          NA
## 54  71.34340          NA
## 55  82.24572          NA
## 56  96.48479          NA
## 57  54.51704          NA
## 58  64.07890          NA
## 59  58.50897          NA
## 60  51.19677          NA
## 61  79.92925          NA
## 62  41.60717          NA
## 63  69.61532          NA
## 64  71.42719          NA
## 65  76.39338          NA
## 66  76.34355          NA
## 67  86.97454          NA
## 68  64.28295          NA
## 69  87.46535          NA
## 70  96.58134          NA
## 71  72.91454          NA
## 72  91.81606          NA
## 73  72.25564          NA
## 74  83.45995          NA
## 75  53.88130          NA
## 76  36.11727          NA
## 77  76.89793          NA
## 78  65.18067          NA
## 79  93.68269          NA
## 80  69.36480          NA
## 81  66.06799          NA
## 82  68.56357          NA
## 83  40.61765          NA
## 84  79.70890          NA
## 85  89.67365          NA
## 86  76.10254          NA
## 87  60.10658          NA
## 88  85.77869          NA
## 89  72.40631          NA
## 90  47.17927          NA
## 91  58.39076          NA
## 92  80.85854          NA
## 93  77.88864          NA
## 94  71.65038          NA
## 95  71.31879          NA
## 96        NA         100
## 97        NA         100
## 98        NA         100
## 99        NA         100
## 100       NA         100
#create the interest group variable in S3 data frame
# Each of the 100 students is assigned one of four interest groups,
# drawn uniformly at random with replacement.
interest <- c("sport", "music", "nature", "literature")
S3$interest <- sample(interest, size = 100, replace = TRUE)
S3
##        score rep.100..5.   interest
## 1   80.87487          NA     nature
## 2   85.01307          NA     nature
## 3   88.43175          NA      music
## 4   85.45668          NA      sport
## 5   62.00681          NA literature
## 6   82.52423          NA      music
## 7   78.15836          NA     nature
## 8   60.30607          NA      music
## 9   91.58159          NA      music
## 10  72.89160          NA literature
## 11  95.22551          NA     nature
## 12  81.92356          NA literature
## 13  85.42065          NA      music
## 14  54.27170          NA     nature
## 15  62.60689          NA      music
## 16  92.09013          NA     nature
## 17  75.18500          NA      music
## 18  69.28725          NA     nature
## 19  71.33826          NA literature
## 20  60.52950          NA     nature
## 21  67.73504          NA      music
## 22  97.09658          NA      music
## 23  84.62451          NA     nature
## 24  62.72625          NA      sport
## 25  77.73485          NA literature
## 26  78.86311          NA      sport
## 27  70.83471          NA     nature
## 28  94.19442          NA literature
## 29  97.80616          NA literature
## 30  61.77682          NA      music
## 31  90.20980          NA     nature
## 32  83.53310          NA      sport
## 33  81.62467          NA      music
## 34  80.34869          NA      sport
## 35  74.04995          NA literature
## 36  69.29995          NA      music
## 37  92.54230          NA      music
## 38  56.72008          NA     nature
## 39  85.40762          NA      music
## 40  89.84091          NA     nature
## 41  62.66578          NA      music
## 42  51.91075          NA     nature
## 43  76.73034          NA      music
## 44  67.68292          NA     nature
## 45  70.69177          NA      music
## 46  66.65772          NA literature
## 47  54.52726          NA     nature
## 48  71.86613          NA      music
## 49  51.99740          NA     nature
## 50  74.27187          NA     nature
## 51  67.59639          NA      sport
## 52  36.87490          NA literature
## 53  70.11316          NA      music
## 54  71.34340          NA      music
## 55  82.24572          NA      sport
## 56  96.48479          NA      sport
## 57  54.51704          NA      sport
## 58  64.07890          NA     nature
## 59  58.50897          NA literature
## 60  51.19677          NA literature
## 61  79.92925          NA      music
## 62  41.60717          NA      sport
## 63  69.61532          NA      sport
## 64  71.42719          NA      sport
## 65  76.39338          NA      music
## 66  76.34355          NA      sport
## 67  86.97454          NA      sport
## 68  64.28295          NA      sport
## 69  87.46535          NA      music
## 70  96.58134          NA     nature
## 71  72.91454          NA     nature
## 72  91.81606          NA      sport
## 73  72.25564          NA literature
## 74  83.45995          NA literature
## 75  53.88130          NA      sport
## 76  36.11727          NA      music
## 77  76.89793          NA      sport
## 78  65.18067          NA     nature
## 79  93.68269          NA      music
## 80  69.36480          NA literature
## 81  66.06799          NA      sport
## 82  68.56357          NA      sport
## 83  40.61765          NA      sport
## 84  79.70890          NA     nature
## 85  89.67365          NA     nature
## 86  76.10254          NA      music
## 87  60.10658          NA literature
## 88  85.77869          NA      music
## 89  72.40631          NA     nature
## 90  47.17927          NA     nature
## 91  58.39076          NA      music
## 92  80.85854          NA      music
## 93  77.88864          NA literature
## 94  71.65038          NA literature
## 95  71.31879          NA      sport
## 96        NA         100      music
## 97        NA         100     nature
## 98        NA         100      sport
## 99        NA         100     nature
## 100       NA         100      sport
  1. Using base R commands, draw a histogram of the scores. Change the breaks in your histogram until you think they best represent your data.
# Histogram of the simulated scores; 30 breaks resolves the cluster around 75
hist(S3$score,breaks=30)

  1. Create a new variable that groups the scores according to the breaks in your histogram.
#cut() divides the range of scores into intervals and codes the values in scores according to which interval they fall. We use a vector called `letters` as the labels, `letters` is a vector made up of the letters of the alphabet.
label <-letters[1:10] 
# Bin each score into 10 equal-width intervals labelled "a".."j"
S3$breaks <-cut(S3$score,breaks=10,labels=label) 
  1. Now using the colorbrewer package (RColorBrewer; http://colorbrewer2.org/#type=sequential&scheme=BuGn&n=3) design a palette and assign it to the groups in your data on the histogram.
library(RColorBrewer)
#Let's look at the available palettes in RColorBrewer

#The top section of palettes are sequential, the middle section are qualitative, and the lower section are diverging.
#Make RColorBrewer palette available to R and assign to your bins

#Use named palette in histogram

# NOTE(review): brewer.pal() returns 10 colors; assigning them to a 100-row
# data frame only works because 100 is a multiple of 10 (R recycles silently).
# hist() applies `col` per bar, so only the first <number-of-bars> entries of
# S3$colors are actually used.
S3$colors <- brewer.pal(10, "Set3")
hist(S3$score,col=S3$colors)

  1. Create a boxplot that visualizes the scores for each interest group and color each interest group a different color.
#Make a vector of the colors from RColorBrewer
# One color per interest group; boxplot() pairs the colors with the factor
# levels in alphabetical order (literature, music, nature, sport).
interest.col <- brewer.pal(4, "Dark2")
boxplot(score ~ interest, data = S3, col = interest.col)

  1. Now simulate a new variable that describes the number of logins that students made to the educational game. They should vary from 1-25.
# Simulate login counts: one uniform integer draw from 1..25 per student
S3$login <- sample(seq_len(25), size = 100, replace = TRUE)
  1. Plot the relationships between logins and scores. Give the plot a title and color the dots according to interest group.
# Color the dots by interest group, as the exercise asks. The original used
# S3$colors (the histogram-bin palette, recycled across rows) and then built
# S3$col1 *after* plotting, with only two colors for four groups — neither
# colored points by interest. A named lookup vector maps each group to a color.
group_cols <- c(sport = "red", music = "blue",
                nature = "darkgreen", literature = "purple")
plot(S3$login, S3$score, col = group_cols[S3$interest],
     main = "Student Logins vs. Scores", xlab = "Logins", ylab = "Score")
  1. R contains several inbuilt data sets, one of these in called AirPassengers. Plot a line graph of the the airline passengers over time using this data set.
# AirPassengers is a built-in monthly time series (ts object)
AP<- data.frame(AirPassengers)
# NOTE(review): AP is never used afterwards; plotting the ts object directly
# draws the passengers-over-time line graph
plot(AirPassengers)

  1. Using another inbuilt data set, iris, plot the relationships between all of the variables in the data set. Which of these relationships is it appropriate to run a correlation on?
# Copy the built-in iris data and draw a scatterplot matrix of every
# pairwise combination of its variables
IR<- data.frame(iris)
pairs(IR)

The pairs function creates a plot for every combination of variables in the dataset, and several of the panels show visible patterns of correlation. We could further investigate the correlations between the following variable pairs: Sepal.Length by Sepal.Width; Sepal.Length by Petal.Length; Sepal.Length by Petal.Width; Sepal.Width by Petal.Length; Sepal.Width by Petal.Width; Petal.Length by Petal.Width. All of these pairs appear to show some correlation, but if only one is to be investigated further, Petal.Length by Petal.Width seems the most appropriate one to run a correlation on.

Part III - Analyzing Swirl

Data

In this repository you will find data describing Swirl activity from the class so far this semester. Please connect RStudio to this repository.

Instructions

  1. Insert a new code block
  2. Create a data frame from the swirl-data.csv file called DF1

The variables are:

course_name - the name of the R course the student attempted
lesson_name - the lesson name
question_number - the question number attempted
correct - whether the question was answered correctly
attempt - how many times the student attempted the question
skipped - whether the student skipped the question
datetime - the date and time the student attempted the question
hash - anonymized student ID

# Load the Swirl activity log (one row per question attempt)
DF1 <- read.csv("swirl-data.csv",header = TRUE)
View(DF1)
  1. Create a new data frame that only includes the variables hash, lesson_name and attempt called DF2
# Select hash, lesson_name and attempt by NAME rather than position
# (originally columns 2, 5, 8) so the code survives column reordering
# in the CSV.
DF2 <- DF1[, c("lesson_name", "attempt", "hash")]
View(DF2)
  1. Use the group_by function to create a data frame that sums all the attempts for each hash by each lesson_name called DF3
# Total attempts per student (hash) per lesson. `.groups = "drop"` returns an
# ungrouped data frame and silences the regrouping message the original emitted.
DF3 <- DF2 %>% group_by(hash, lesson_name) %>% summarise(sum_attempts = sum(attempt), .groups = "drop")
View(DF3)
  1. On a scrap piece of paper draw what you think DF3 would look like if all the lesson names were column names DF3 sketch

  2. Convert DF3 to this format

# spread() is superseded in tidyr; pivot_wider() is its maintained replacement.
# Result: one row per hash, one column per lesson_name, cells = total attempts
# (NA where a student never attempted that lesson).
DF3_wide <- pivot_wider(DF3, names_from = lesson_name, values_from = sum_attempts)
View(DF3_wide)
  1. Create a new data frame from DF1 called DF4 that only includes the variables hash, lesson_name and correct
# Select hash, lesson_name and correct by NAME rather than position
# (originally columns 2, 4, 8) for robustness to column reordering.
DF4 <- DF1[, c("lesson_name", "correct", "hash")]
View(DF4)
  1. Convert the correct variable so that TRUE is coded as the number 1 and FALSE is coded as 0
# Recode the correct flag: "TRUE" -> 1, everything else ("FALSE") -> 0.
# NA attempts stay NA because ifelse() propagates NA conditions.
DF4$correct<- ifelse(DF4$correct=="TRUE",1,0)
View(DF4)
  1. Create a new data frame called DF5 that provides a mean score for each student on each course
# Select course_name, correct and hash by NAME (originally columns 1, 4, 8)
DF5 <- DF1[, c("course_name", "correct", "hash")]
DF5$correct <- ifelse(DF5$correct == "TRUE", 1, 0)
# NOTE(review): NA means the question was never answered; coding it as 0
# treats unanswered as incorrect, which lowers the mean score.
DF5$correct[is.na(DF5$correct)] <- 0
# Mean score per student per course; `.groups = "drop"` returns an ungrouped
# result and silences the regrouping message.
DF5 <- DF5 %>% group_by(hash, course_name) %>% summarise(mean_score = mean(correct), .groups = "drop")
View(DF5)
  1. Extra credit Convert the datetime variable into month-day-year format and create a new data frame (DF6) that shows the average correct for each day
# Select correct and datetime by NAME (originally columns 4 and 7)
DF6 <- DF1[, c("correct", "datetime")]
library(anytime)
# anytime() guesses the timestamp format and returns POSIXct;
# then reformat the date part as month-day-year.
DF6$datetime <- anytime(DF6$datetime)
DF6$datetime <- format(as.Date(DF6$datetime), "%m-%d-%Y")
DF6$correct <- ifelse(DF6$correct == "TRUE", 1, 0)
DF6$correct[is.na(DF6$correct)] <- 0 # treat unanswered questions as incorrect
# Average proportion correct per day; `.groups = "drop"` silences the
# summarise() message the original emitted.
DF6 <- DF6 %>% group_by(datetime) %>% summarise(mean_correct = mean(correct), .groups = "drop")
View(DF6)

Finally use the knitr function to generate an html document from your work. Commit, Push and Pull Request your work back to the main branch of the repository. Make sure you include both the .Rmd file and the .html file.